package dswork.analyzer.core;

/**
 * 源代码由林良益(linliangyi2005@gmail.com)提供
 * 版权声明 2012，乌龙茶工作室
 * provided by Linliangyi and copyright 2012 by Oolong studio
 * 词元对象
 */
public class Lexeme implements Comparable<Lexeme>
{
	// lexemeType常量
	// 未知
	public static final int TYPE_UNKNOWN = 0;
	// 英文
	public static final int TYPE_ENGLISH = 1;
	// 数字
	public static final int TYPE_ARABIC = 2;
	// 英文数字混合
	public static final int TYPE_LETTER = 3;
	// 中文词元
	public static final int TYPE_CNWORD = 4;
	// 中文单字
	public static final int TYPE_CNCHAR = 64;
	// 日韩文字
	public static final int TYPE_OTHER_CJK = 8;
	// 中文数词
	public static final int TYPE_CNNUM = 16;
	// 中文量词
	public static final int TYPE_COUNT = 32;
	// 中文数量词
	public static final int TYPE_CQUAN = 48;
	// 词元的起始位移
	private int offset;
	// 词元的相对起始位置
	private int begin;
	// 词元的长度
	private int length;
	// 词元文本
	private String lexemeText;
	// 词元类型
	private int lexemeType;

	public Lexeme(int offset, int begin, int length, int lexemeType)
	{
		this.offset = offset;
		this.begin = begin;
		if(length < 0)
		{
			throw new IllegalArgumentException("length < 0");
		}
		this.length = length;
		this.lexemeType = lexemeType;
	}

	/*
	 * 判断词元相等算法
	 * 起始位置偏移、起始位置、终止位置相同
	 * @see java.lang.Object#equals(Object o)
	 */
	public boolean equals(Object o)
	{
		if(o == null)
		{
			return false;
		}
		if(this == o)
		{
			return true;
		}
		if(o instanceof Lexeme)
		{
			Lexeme other = (Lexeme) o;
			if(this.offset == other.getOffset() && this.begin == other.getBegin() && this.length == other.getLength())
			{
				return true;
			}
			else
			{
				return false;
			}
		}
		else
		{
			return false;
		}
	}

	/*
	 * 词元哈希编码算法
	 * @see java.lang.Object#hashCode()
	 */
	public int hashCode()
	{
		int absBegin = getBeginPosition();
		int absEnd = getEndPosition();
		return (absBegin * 37) + (absEnd * 31) + ((absBegin * absEnd) % getLength()) * 11;
	}

	/*
	 * 词元在排序集合中的比较算法
	 * @see java.lang.Comparable#compareTo(java.lang.Object)
	 */
	public int compareTo(Lexeme other)
	{
		// 起始位置优先
		if(this.begin < other.getBegin())
		{
			return -1;
		}
		else if(this.begin == other.getBegin())
		{
			// 词元长度优先
			if(this.length > other.getLength())
			{
				return -1;
			}
			else if(this.length == other.getLength())
			{
				return 0;
			}
			else
			{// this.length < other.getLength()
				return 1;
			}
		}
		else
		{// this.begin > other.getBegin()
			return 1;
		}
	}

	public int getOffset()
	{
		return offset;
	}

	public void setOffset(int offset)
	{
		this.offset = offset;
	}

	public int getBegin()
	{
		return begin;
	}

	/**
	 * 获取词元在文本中的起始位置
	 * @return int
	 */
	public int getBeginPosition()
	{
		return offset + begin;
	}

	public void setBegin(int begin)
	{
		this.begin = begin;
	}

	/**
	 * 获取词元在文本中的结束位置
	 * @return int
	 */
	public int getEndPosition()
	{
		return offset + begin + length;
	}

	/**
	 * 获取词元的字符长度
	 * @return int
	 */
	public int getLength()
	{
		return this.length;
	}

	public void setLength(int length)
	{
		if(this.length < 0)
		{
			throw new IllegalArgumentException("length < 0");
		}
		this.length = length;
	}

	/**
	 * 获取词元的文本内容
	 * @return String
	 */
	public String getLexemeText()
	{
		if(lexemeText == null)
		{
			return "";
		}
		return lexemeText;
	}

	public void setLexemeText(String lexemeText)
	{
		if(lexemeText == null)
		{
			this.lexemeText = "";
			this.length = 0;
		}
		else
		{
			this.lexemeText = lexemeText;
			this.length = lexemeText.length();
		}
	}

	/**
	 * 获取词元类型
	 * @return int
	 */
	public int getLexemeType()
	{
		return lexemeType;
	}

	/**
	 * 获取词元类型标示字符串
	 * @return String
	 */
	public String getLexemeTypeString()
	{
		switch(lexemeType)
		{
			case TYPE_ENGLISH:
				return "ENGLISH";
			case TYPE_ARABIC:
				return "ARABIC";
			case TYPE_LETTER:
				return "LETTER";
			case TYPE_CNWORD:
				return "CN_WORD";
			case TYPE_CNCHAR:
				return "CN_CHAR";
			case TYPE_OTHER_CJK:
				return "OTHER_CJK";
			case TYPE_COUNT:
				return "COUNT";
			case TYPE_CNNUM:
				return "CN_NUM";
			case TYPE_CQUAN:
				return "TYPE_CQUAN";
			default:
				return "UNKONW";
		}
	}

	public void setLexemeType(int lexemeType)
	{
		this.lexemeType = lexemeType;
	}

	/**
	 * 合并两个相邻的词元
	 * @param l Lexeme
	 * @param lexemeType 类型
	 * @return boolean 词元是否成功合并
	 */
	public boolean append(Lexeme l, int lexemeType)
	{
		if(l != null && this.getEndPosition() == l.getBeginPosition())
		{
			this.length += l.getLength();
			this.lexemeType = lexemeType;
			return true;
		}
		else
		{
			return false;
		}
	}

	/**
	 * 格式化字符串
	 */
	public String toString()
	{
		StringBuffer strbuf = new StringBuffer();
		strbuf.append(this.getBeginPosition()).append("-").append(this.getEndPosition());
		strbuf.append(" : ").append(this.lexemeText).append(" : \t");
		strbuf.append(this.getLexemeTypeString());
		return strbuf.toString();
	}
}
